{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#进行数据集的分片\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import csv\n",
    "import pandas as pd\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Begin Time : 2020-02-01 15:42:12\n",
      "End Time : 2020-02-01 15:42:14\n"
     ]
    }
   ],
   "source": [
    "begin_time = time.time()\n",
    "print(f'Begin Time : {time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(begin_time))}')\n",
    "\n",
    "file = './out_put/encode_data.csv'\n",
    "df = pd.read_csv(file, index_col = 0)\n",
    "\n",
    "end_time = time.time()\n",
    "print(f'End Time : {time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(end_time))}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Begin Time : 2020-02-01 15:54:53\n",
      "End Time : 2020-02-01 15:54:54\n"
     ]
    }
   ],
   "source": [
    "#对编码之后的数据进行分片\n",
    "begin_time = time.time()\n",
    "print(f'Begin Time : {time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(begin_time))}')\n",
    "\n",
    "#将数据分为train/validata/testdata三部分\n",
    "df_1 = df[df['click'] == 1]\n",
    "df_0 = df[df['click'] == 0]\n",
    "\n",
    "df_1_test =df_1.sample(frac=0.3, random_state=100)\n",
    "df_0_test =df_0.sample(frac=0.3, random_state=100)\n",
    "\n",
    "df_1_other = df_1[~df_1.index.isin(df_1_test.index)]\n",
    "df_0_other = df_0[~df_0.index.isin(df_0_test.index)]\n",
    "\n",
    "df_1_vali = df_1_other.sample(frac=0.2, random_state=100)\n",
    "df_0_vali = df_0_other.sample(frac=0.2, random_state=100)\n",
    "\n",
    "df_1_train = df_1_other[~df_1_other.index.isin(df_1_vali.index)]\n",
    "df_0_train = df_0_other[~df_0_other.index.isin(df_0_vali.index)]\n",
    "\n",
    "#合并1/0\n",
    "df_train = pd.concat([df_1_train,df_0_train], ignore_index=True)\n",
    "df_vali = pd.concat([df_1_vali,df_0_vali], ignore_index=True)\n",
    "df_test = pd.concat([df_1_test,df_0_test], ignore_index=True)\n",
    "\n",
    "print(f'--split data : {time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(time.time()))}')\n",
    "\n",
    "nums_train = df_train['click'].count()\n",
    "nums_vali = df_train['click'].count()\n",
    "nums_test = df_train['click'].count()\n",
    "\n",
    "print(f'--split rate train VS vali VS test: {nums_train}:{nums_vali}:{nums_test}')\n",
    "\n",
    "df_train.to_csv('./out_put/encode_data_train.csv')\n",
    "df_train.to_csv('./out_put/encode_data_train.csv')\n",
    "df_train.to_csv('./out_put/encode_data_train.csv')\n",
    "\n",
    "end_time = time.time()\n",
    "print(f'End Time : {time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(end_time))}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>click</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>device_type</th>\n",
       "      <th>device_conn_type</th>\n",
       "      <th>C1</th>\n",
       "      <th>C14</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>...</th>\n",
       "      <th>C21</th>\n",
       "      <th>site_id_enconde</th>\n",
       "      <th>site_domain_enconde</th>\n",
       "      <th>site_category_enconde</th>\n",
       "      <th>app_id_enconde</th>\n",
       "      <th>app_domain_enconde</th>\n",
       "      <th>app_category_enconde</th>\n",
       "      <th>device_id_enconde</th>\n",
       "      <th>device_ip_enconde</th>\n",
       "      <th>device_model_enconde</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1005</td>\n",
       "      <td>15707</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>260</td>\n",
       "      <td>1927</td>\n",
       "      <td>2</td>\n",
       "      <td>2127</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>55297</td>\n",
       "      <td>67996</td>\n",
       "      <td>1684</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>15701</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>260</td>\n",
       "      <td>1927</td>\n",
       "      <td>2</td>\n",
       "      <td>2127</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>55297</td>\n",
       "      <td>242399</td>\n",
       "      <td>2278</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>21611</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2480</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>61</td>\n",
       "      <td>1110</td>\n",
       "      <td>1568</td>\n",
       "      <td>6</td>\n",
       "      <td>789</td>\n",
       "      <td>22</td>\n",
       "      <td>3</td>\n",
       "      <td>50522</td>\n",
       "      <td>51233</td>\n",
       "      <td>4233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>15708</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>260</td>\n",
       "      <td>1927</td>\n",
       "      <td>2</td>\n",
       "      <td>2127</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>55297</td>\n",
       "      <td>200821</td>\n",
       "      <td>1471</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>18993</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2161</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>157</td>\n",
       "      <td>1110</td>\n",
       "      <td>1568</td>\n",
       "      <td>6</td>\n",
       "      <td>100</td>\n",
       "      <td>41</td>\n",
       "      <td>21</td>\n",
       "      <td>55297</td>\n",
       "      <td>252321</td>\n",
       "      <td>804</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    click  banner_pos  device_type  device_conn_type    C1    C14  C15  C16  \\\n",
       "8       1           0            1                 2  1005  15707  320   50   \n",
       "23      1           0            1                 0  1005  15701  320   50   \n",
       "27      1           0            1                 0  1005  21611  320   50   \n",
       "32      1           0            1                 0  1005  15708  320   50   \n",
       "37      1           0            1                 0  1005  18993  320   50   \n",
       "\n",
       "     C17  C18          ...           C21  site_id_enconde  \\\n",
       "8   1722    0          ...            79              260   \n",
       "23  1722    0          ...            79              260   \n",
       "27  2480    3          ...            61             1110   \n",
       "32  1722    0          ...            79              260   \n",
       "37  2161    0          ...           157             1110   \n",
       "\n",
       "    site_domain_enconde  site_category_enconde  app_id_enconde  \\\n",
       "8                  1927                      2            2127   \n",
       "23                 1927                      2            2127   \n",
       "27                 1568                      6             789   \n",
       "32                 1927                      2            2127   \n",
       "37                 1568                      6             100   \n",
       "\n",
       "    app_domain_enconde  app_category_enconde  device_id_enconde  \\\n",
       "8                   73                     0              55297   \n",
       "23                  73                     0              55297   \n",
       "27                  22                     3              50522   \n",
       "32                  73                     0              55297   \n",
       "37                  41                    21              55297   \n",
       "\n",
       "    device_ip_enconde  device_model_enconde  \n",
       "8               67996                  1684  \n",
       "23             242399                  2278  \n",
       "27              51233                  4233  \n",
       "32             200821                  1471  \n",
       "37             252321                   804  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_1_other.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>click</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>device_type</th>\n",
       "      <th>device_conn_type</th>\n",
       "      <th>C1</th>\n",
       "      <th>C14</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>...</th>\n",
       "      <th>C21</th>\n",
       "      <th>site_id_enconde</th>\n",
       "      <th>site_domain_enconde</th>\n",
       "      <th>site_category_enconde</th>\n",
       "      <th>app_id_enconde</th>\n",
       "      <th>app_domain_enconde</th>\n",
       "      <th>app_category_enconde</th>\n",
       "      <th>device_id_enconde</th>\n",
       "      <th>device_ip_enconde</th>\n",
       "      <th>device_model_enconde</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1005</td>\n",
       "      <td>15707</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>260</td>\n",
       "      <td>1927</td>\n",
       "      <td>2</td>\n",
       "      <td>2127</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>55297</td>\n",
       "      <td>67996</td>\n",
       "      <td>1684</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>15701</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>260</td>\n",
       "      <td>1927</td>\n",
       "      <td>2</td>\n",
       "      <td>2127</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>55297</td>\n",
       "      <td>242399</td>\n",
       "      <td>2278</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>21611</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2480</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>61</td>\n",
       "      <td>1110</td>\n",
       "      <td>1568</td>\n",
       "      <td>6</td>\n",
       "      <td>789</td>\n",
       "      <td>22</td>\n",
       "      <td>3</td>\n",
       "      <td>50522</td>\n",
       "      <td>51233</td>\n",
       "      <td>4233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>15708</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>260</td>\n",
       "      <td>1927</td>\n",
       "      <td>2</td>\n",
       "      <td>2127</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>55297</td>\n",
       "      <td>200821</td>\n",
       "      <td>1471</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>18993</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2161</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>157</td>\n",
       "      <td>1110</td>\n",
       "      <td>1568</td>\n",
       "      <td>6</td>\n",
       "      <td>100</td>\n",
       "      <td>41</td>\n",
       "      <td>21</td>\n",
       "      <td>55297</td>\n",
       "      <td>252321</td>\n",
       "      <td>804</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    click  banner_pos  device_type  device_conn_type    C1    C14  C15  C16  \\\n",
       "8       1           0            1                 2  1005  15707  320   50   \n",
       "23      1           0            1                 0  1005  15701  320   50   \n",
       "27      1           0            1                 0  1005  21611  320   50   \n",
       "32      1           0            1                 0  1005  15708  320   50   \n",
       "37      1           0            1                 0  1005  18993  320   50   \n",
       "\n",
       "     C17  C18          ...           C21  site_id_enconde  \\\n",
       "8   1722    0          ...            79              260   \n",
       "23  1722    0          ...            79              260   \n",
       "27  2480    3          ...            61             1110   \n",
       "32  1722    0          ...            79              260   \n",
       "37  2161    0          ...           157             1110   \n",
       "\n",
       "    site_domain_enconde  site_category_enconde  app_id_enconde  \\\n",
       "8                  1927                      2            2127   \n",
       "23                 1927                      2            2127   \n",
       "27                 1568                      6             789   \n",
       "32                 1927                      2            2127   \n",
       "37                 1568                      6             100   \n",
       "\n",
       "    app_domain_enconde  app_category_enconde  device_id_enconde  \\\n",
       "8                   73                     0              55297   \n",
       "23                  73                     0              55297   \n",
       "27                  22                     3              50522   \n",
       "32                  73                     0              55297   \n",
       "37                  41                    21              55297   \n",
       "\n",
       "    device_ip_enconde  device_model_enconde  \n",
       "8               67996                  1684  \n",
       "23             242399                  2278  \n",
       "27              51233                  4233  \n",
       "32             200821                  1471  \n",
       "37             252321                   804  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
